Read in the data

Due to the inconsistent column naming covention, we manually convert the cx column to cX in an effort to reduce obfuscation.

# Read in the data
data <- read.csv('Greatest_Aussie_Groceries_sales_data.csv', header=TRUE, sep=",")
# Change column name of cx to match style of capital X and Y
colnames(data)[colnames(data)=="cx"] <- "cX"

Mutate Data

We mutate the data to include columns for deal_feat (an indictator for both deal and features), revenue, and profit.

# Append a deal_feat column for X and Y
data <- mutate(data, deal_feat_Y = deal_Y*10 + feat_Y, deal_feat_X = deal_X*10 + feat_X)
# Append a revenue column for X and Y 
data <- mutate(data, rev_X = oz_X * pX, rev_Y = oz_Y * pY)
# Append a profit column for X and Y 
data <- mutate(data, profit_X = rev_X - cX * pX, profit_Y = rev_Y - cY * pY)

Summary Plots

Lets start by just plotting the profit over the 52 weeks for each store.

plotStore <- function(STORE) {
# The palette with black:
cbbPalette <- c("red4", "orange1", "blue4", "skyblue1", "green4", "chartreuse1", "darkorchid4", "mediumorchid1")
# Pull data into temp results dataframe
results <- data.frame(WEEK=c(1:52))
results[,c("profit_X_org","profit_Y_org")] <- data %>% filter(.,STORE==STORE, class=="organic") %>% select(.,profit_X, profit_Y)
results[,c("profit_X_non","profit_Y_non")] <- data %>% filter(.,STORE==STORE, class=="nonorganic") %>% select(.,profit_X, profit_Y)
results[,c("profit_X_org_col","profit_Y_org_col")] <- data %>% filter(.,STORE==STORE, class=="organic") %>% select(.,deal_X, deal_Y)
results[,c("profit_X_non_col","profit_Y_non_col")] <- data %>% filter(.,STORE==STORE, class=="nonorganic") %>% select(.,deal_X, deal_Y)
# Assign legend name to categorical data
results$profit_X_org_col[results$profit_X_org_col == 0] <- "X organic w/o deal"
results$profit_X_org_col[results$profit_X_org_col == 1] <- "X organic w/ deal"
results$profit_Y_org_col[results$profit_Y_org_col == 0] <- "Y organic w/o deal"
results$profit_Y_org_col[results$profit_Y_org_col == 1] <- "Y organic w deal"
results$profit_X_non_col[results$profit_X_non_col == 0] <- "X nonorganic w/o deal"
results$profit_X_non_col[results$profit_X_non_col == 1] <- "X nonorganic w deal"
results$profit_Y_non_col[results$profit_Y_non_col == 0] <- "Y nonorganic w/o deal"
results$profit_Y_non_col[results$profit_Y_non_col == 1] <- "Y nonorganic w deal"
# Plot results
ggplot(results, aes(x=WEEK)) + 
  geom_point(aes(y=profit_X_org, colour=profit_X_org_col)) +
  geom_point(aes(y=profit_Y_org, colour=profit_Y_org_col)) + 
  geom_point(aes(y=profit_X_non, colour=profit_X_non_col)) +
  geom_point(aes(y=profit_Y_non, colour=profit_Y_non_col)) + 
  scale_colour_manual(values=cbbPalette) +
  labs(x = "Week", y = "Profit", title = paste("Profit for Store",STORE)) 
}

# Plot the data
plotStore(1)

plotStore(2)

plotStore(3)

plotStore(4)

plotStore(5)

plotStore(6)

plotStore(7)

Box plots of profit for the different products

This is just a simple box plot analysis

# Function to retreve data from dataframe and composite into factor
retrieveData <- function(data, deal, CLASS, xy) {
  names <- c("STORE", "PROFIT")
  if (xy == "x")
    temp <- data %>% filter(deal_X==deal, class==CLASS) %>% select(STORE, profit_X)
   else
    temp <- data %>% filter(deal_Y==deal, class==CLASS) %>% select(STORE, profit_Y)
  colnames(temp) <- names
  name <- if (xy == "x") "X" else "Y"
  name <- if (CLASS == "organic") paste(name,"O",sep="") else paste(name,"I",sep="") 
  name <- if (deal == 1) paste(name,"deal",sep="_") else paste(name,"no_deal",sep="_") 
  return(data.frame(type=rep(name,nrow(temp)),temp))
}

boxplot_data <- retrieveData(data, deal=0, CLASS="organic", xy="x")
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=0, CLASS="nonorganic", xy="x"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=0, CLASS="organic", xy="y"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=0, CLASS="nonorganic", xy="y"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=1, CLASS="organic", xy="x"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=1, CLASS="nonorganic", xy="x"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=1, CLASS="organic", xy="y"))
boxplot_data <- rbind(boxplot_data,retrieveData(data, deal=1, CLASS="nonorganic", xy="y"))

ggplot(boxplot_data, aes(x=type, y=PROFIT)) + geom_boxplot() + labs(title="Boxplots of profit for different products", x="Product", y = "Profit")

Given the massive spread between no deal and deal data, let’s take a log10() scale transform of the y-axis

boxplot_data <- mutate(boxplot_data, log10PROFIT=log10(PROFIT))
ggplot(boxplot_data, aes(x=type, y=log10PROFIT)) + geom_boxplot() + labs(title="Boxplots of log10(profit) for different products", x="Product", y = "log10(Profit)")

It is clear from these boxplots that the median values for X and Y products are approximately equal. The spread of profit for Y is much larger than X. Profit increases dramatically when a deal is going on.

Just as an additional spam of figures, lets look at the box plots for each product seperated by store (again with a log10 scaling)

boxplot_data %>% filter(type=="XO_no_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "XO_no_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="XI_no_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "XI_no_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="YO_no_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "YO_no_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="YI_no_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "YI_no_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="XO_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "XO_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="XI_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "XI_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="YO_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "YO_deal" ,"for different stores"), x="Store", y = "Profit")

boxplot_data %>% filter(type=="YI_deal") %>% ggplot(., aes(x=STORE, y=PROFIT)) + geom_boxplot(aes(group = cut_width(STORE, 1))) + labs(title=paste("Profit of", "YI_deal" ,"for different stores"), x="Store", y = "Profit")

Regression

# Function that pulls out the data
retrieveDataLM <- function(data, CLASS, xy) {
  names <- c("STORE", "oz", "p", "deal")
  if (xy == "x")
    temp <- data %>% filter(.,class==CLASS) %>% select(.,STORE, oz_X, pX, deal_X)
  else
    temp <- data %>% filter(class==CLASS) %>% select(STORE, oz_Y, pY, deal_Y)
  colnames(temp) <- names
  name <- if (xy == "x") "X" else "Y"
  name <- if (CLASS == "organic") paste(name,"O",sep="") else paste(name,"I",sep="") 
  return(data.frame(type=rep(name,nrow(temp)),temp))
}

# Function that plots the loglog and actual curves
filteredLM <- function(data, TYPE, log=FALSE, include_Deal=FALSE) {
  response <- data %>% filter(type==TYPE) %>% select(oz) %>% {if (log) log(.) else (.)} %>% as.matrix()
  elasticity <- data %>% filter(type==TYPE) %>% select(p) %>% {if (log) log(.) else (.)} %>% as.matrix()
  deal <- data %>% filter(type==TYPE) %>% select(deal) %>% {if (include_Deal) (.) else (.)*0} %>% as.matrix()
  return(lm(response~elasticity+deal))
}

# Function to plot Prediction with actual data
plotStatsLogLog <- function(lm, type="loglog", title="") {
  results <- data.frame(p=lm$model$elasticity, oz=lm$model$response, resid=resid(lm))
  results <- mutate(results, loglogSol=lm$coefficients[1] + lm$coefficients[2]*p + if (is.na(lm$coefficients[3])) 0 else lm$coefficients[3]*lm$model$deal)
  results$sol <- exp(results$loglogSol)
  if (type == "loglog")
    ggplot(results) + geom_point(aes(x=p, y=oz)) + geom_line(aes(x=p, y=loglogSol)) + labs(x="log(p)", y="log(oz)", title=title)
  else
    ggplot(results) + geom_point(aes(x=exp(p), y=exp(oz))) + geom_line(aes(x=exp(p), y=sol))  + labs(x="p", y="oz", title=title)
}



# Function that generates the inear model
lm_data <- retrieveDataLM(data, CLASS="organic", xy="x")
lm_data <- rbind(lm_data,retrieveDataLM(data, CLASS="nonorganic", xy="x"))
lm_data <- rbind(lm_data,retrieveDataLM(data, CLASS="organic", xy="y"))
lm_data <- rbind(lm_data,retrieveDataLM(data, CLASS="nonorganic", xy="y"))

Log Regression for Organic X without deal dummy variable

Linear regression for \(\log(oz_{x-org}) = \log(k) + r \log(p_{x-org})\)

# Linear Model Summary for Organic X product without including the deal dummy variable
XO_lm_without_deal <- filteredLM(lm_data, TYPE="XO", log=TRUE, include_Deal=FALSE)
summary(XO_lm_without_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65001 -0.23331 -0.06901  0.14514  1.17942 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -16.8432     0.5183  -32.49   <2e-16 ***
## elasticity   -7.0398     0.1445  -48.73   <2e-16 ***
## deal              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3749 on 362 degrees of freedom
## Multiple R-squared:  0.8677, Adjusted R-squared:  0.8674 
## F-statistic:  2375 on 1 and 362 DF,  p-value: < 2.2e-16
plotStatsLogLog(XO_lm_without_deal, "loglog", "loglog PED for Organic X without deal")

plotStatsLogLog(XO_lm_without_deal, "normal", "PED for Organic X without deal")

Log Regression for Organic X with deal dummy variable

Linear regression for \(\log(oz_{x-org}) = \log(k) + r \log(p_{x-org}) + deal_{x-org}\)

# Linear Model Summary for Organic X product including deal dummy variable
XO_lm_with_deal <- filteredLM(lm_data, TYPE="XO", log=TRUE, include_Deal=TRUE)
summary(XO_lm_with_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48496 -0.12887 -0.03184  0.08066  1.21545 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.16811    0.71250  -4.446 1.16e-05 ***
## elasticity  -3.12964    0.20256 -15.450  < 2e-16 ***
## deal         1.39566    0.06387  21.851  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2463 on 361 degrees of freedom
## Multiple R-squared:  0.9431, Adjusted R-squared:  0.9427 
## F-statistic:  2989 on 2 and 361 DF,  p-value: < 2.2e-16
plotStatsLogLog(XO_lm_with_deal, "loglog", "loglog PED for Organic X with deal")

plotStatsLogLog(XO_lm_with_deal, "normal", "PED for Organic X with deal")

Log Regression for Organic Y without deal dummy variable

Linear regression for \(\log(oz_{y-org}) = \log(k) + r \log(p_{y-org})\)

# Linear Model Summary for Organic Y product without including the deal dummy variable
YO_lm_without_deal <- filteredLM(lm_data, TYPE="YO", log=TRUE, include_Deal=FALSE)
summary(YO_lm_without_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.85175 -0.65452  0.03862  0.62843  2.76432 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.7803     1.2829  -12.30   <2e-16 ***
## elasticity   -6.7308     0.3595  -18.72   <2e-16 ***
## deal              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9754 on 362 degrees of freedom
## Multiple R-squared:  0.4919, Adjusted R-squared:  0.4905 
## F-statistic: 350.5 on 1 and 362 DF,  p-value: < 2.2e-16
plotStatsLogLog(YO_lm_without_deal, "loglog", "loglog PED for Organic Y without deal")

plotStatsLogLog(YO_lm_without_deal, "normal", "PED for Organic Y without deal")

Log Regression for Organic Y with deal dummy variable

Linear regression for \(\log(oz_{y-org}) = \log(k) + r \log(p_{y-org}) + deal_{y-org}\)

# Linear Model Summary for Organic Y product including deal dummy variable
YO_lm_with_deal <- filteredLM(lm_data, TYPE="YO", log=TRUE, include_Deal=TRUE)
summary(YO_lm_with_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.79111 -0.65105  0.03551  0.60161  2.80730 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.9350     2.3642  -2.510   0.0125 *  
## elasticity   -3.9058     0.6740  -5.795 1.49e-08 ***
## deal          1.1975     0.2445   4.897 1.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9458 on 361 degrees of freedom
## Multiple R-squared:  0.5236, Adjusted R-squared:  0.5209 
## F-statistic: 198.4 on 2 and 361 DF,  p-value: < 2.2e-16
plotStatsLogLog(YO_lm_with_deal, "loglog", "loglog PED for Organic Y with deal")

plotStatsLogLog(YO_lm_with_deal, "normal", "PED for Organic Y with deal")

Log Regression for Nonorganic X without deal dummy variable

Linear regression for \(\log(oz_{x-nonorg}) = \log(k) + r \log(p_{x-nonorg})\)

# Linear Model Summary for Nonorganic X product without including the deal dummy variable
XI_lm_without_deal <- filteredLM(lm_data, TYPE="XI", log=TRUE, include_Deal=FALSE)
summary(XI_lm_without_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62446 -0.23014 -0.09199  0.08494  1.27804 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.8704     0.5801  -27.36   <2e-16 ***
## elasticity   -6.7511     0.1630  -41.43   <2e-16 ***
## deal              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3569 on 362 degrees of freedom
## Multiple R-squared:  0.8258, Adjusted R-squared:  0.8253 
## F-statistic:  1716 on 1 and 362 DF,  p-value: < 2.2e-16
plotStatsLogLog(XI_lm_without_deal, "loglog", "loglog PED for Nonorganic X without deal")

plotStatsLogLog(XI_lm_without_deal, "normal", "PED for Nonrganic X without deal")

Log Regression for Nonorganic X with deal dummy variable

Linear regression for \(\log(oz_{x-nonorg}) = \log(k) + r \log(p_{x-nonorg}) + deal_{x-nonorg}\)

# Linear Model Summary for Nonorganic X product including deal dummy variable
XI_lm_with_deal <- filteredLM(lm_data, TYPE="XI", log=TRUE, include_Deal=TRUE)
summary(XI_lm_with_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52573 -0.10589 -0.00324  0.06946  1.30459 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.27234    0.60919  -5.372  1.4e-07 ***
## elasticity  -3.15079    0.17312 -18.200  < 2e-16 ***
## deal         1.40060    0.05548  25.243  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2149 on 361 degrees of freedom
## Multiple R-squared:  0.937,  Adjusted R-squared:  0.9367 
## F-statistic:  2685 on 2 and 361 DF,  p-value: < 2.2e-16
plotStatsLogLog(XI_lm_with_deal, "loglog", "loglog PED for Nonorganic X with deal")

plotStatsLogLog(XI_lm_with_deal, "normal", "PED for Nonorganic X with deal")

Log Regression for Nonorganic Y without deal dummy variable

Linear regression for \(\log(oz_{y-nonorg}) = \log(k) + r \log(p_{y-nonorg})\)

# Linear Model Summary for Nonorganic Y product without including the deal dummy variable
YI_lm_without_deal <- filteredLM(lm_data, TYPE="YI", log=TRUE, include_Deal=FALSE)
summary(YI_lm_without_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5546 -0.6087  0.0357  0.6109  3.4261 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.4133     1.2595  -12.24   <2e-16 ***
## elasticity   -6.6570     0.3529  -18.86   <2e-16 ***
## deal              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9586 on 362 degrees of freedom
## Multiple R-squared:  0.4957, Adjusted R-squared:  0.4943 
## F-statistic: 355.8 on 1 and 362 DF,  p-value: < 2.2e-16
plotStatsLogLog(YI_lm_without_deal, "loglog", "loglog PED for Nonorganic Y without deal")

plotStatsLogLog(YI_lm_without_deal, "normal", "PED for Nonorganic Y without deal")

Log Regression for Nonorganic Y with deal dummy variable

Linear regression for \(\log(oz_{y-nonorg}) = \log(k) + r \log(p_{y-nonorg}) + deal_{y-nonorg}\)

# Linear Model Summary for Nonorganic Y product including deal dummy variable
YI_lm_with_deal <- filteredLM(lm_data, TYPE="YI", log=TRUE, include_Deal=TRUE)
summary(YI_lm_with_deal)
## 
## Call:
## lm(formula = response ~ elasticity + deal)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46784 -0.56287  0.02177  0.56931  3.02660 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.3106     2.3526  -0.982    0.327    
## elasticity   -2.8956     0.6712  -4.314 2.07e-05 ***
## deal          1.5425     0.2386   6.464 3.31e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9087 on 361 degrees of freedom
## Multiple R-squared:  0.548,  Adjusted R-squared:  0.5455 
## F-statistic: 218.8 on 2 and 361 DF,  p-value: < 2.2e-16
plotStatsLogLog(YI_lm_with_deal, "loglog", "loglog PED for Nonorganic Y with deal")

plotStatsLogLog(YI_lm_with_deal, "normal", "PED for Nonorganic Y with deal")